ts object for
time serieswindowIf you get an error go to the packages panel, click install and type the name of the library (you only need to do this once).
library(fpp2)
library(tidyverse)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
fdata <- read_csv("Unemployment.dat")
## Rows: 156 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): DATE
## dbl (1): TOTAL
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(fdata)
## # A tibble: 6 × 2
## DATE TOTAL
## <chr> <dbl>
## 1 01/01/2010 4048493
## 2 01/02/2010 4130625
## 3 01/03/2010 4166613
## 4 01/04/2010 4142425
## 5 01/05/2010 4066202
## 6 01/06/2010 3982368
glimpse(fdata)
## Rows: 156
## Columns: 2
## $ DATE <chr> "01/01/2010", "01/02/2010", "01/03/2010", "01/04/2010", "01/05/2…
## $ TOTAL <dbl> 4048493, 4130625, 4166613, 4142425, 4066202, 3982368, 3908578, 3…
fdata$DATE <- as.Date(fdata$DATE, format = "%d/%m/%Y")
head(fdata)
## # A tibble: 6 × 2
## DATE TOTAL
## <date> <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-05-01 4066202
## 6 2010-06-01 3982368
Order the table by date (using the pipe operator)
fdata <- fdata %>% arrange(DATE)
# same as: fdata <- arrange(fdata,DATE)
How do we know if there are time gaps in the data?
range(fdata$DATE)
## [1] "2010-01-01" "2022-12-01"
min(fdata$DATE)
## [1] "2010-01-01"
max(fdata$DATE)
## [1] "2022-12-01"
Therefore we can create a complete sequence of months with the same range and compare it to the dates in our data.
date_range <- seq.Date(min(fdata$DATE), max(fdata$DATE), by = "months")
head(date_range)
## [1] "2010-01-01" "2010-02-01" "2010-03-01" "2010-04-01" "2010-05-01"
## [6] "2010-06-01"
tail(date_range)
## [1] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [6] "2022-12-01"
Now we do the comparison
date_range[!date_range %in% fdata$DATE]
## Date of length 0
To practice this, let us create a new dataset missing one date.
fdata2 <- fdata[c(1:4, 6:nrow(fdata)), ]
head(fdata2)
## # A tibble: 6 × 2
## DATE TOTAL
## <date> <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-06-01 3982368
## 6 2010-07-01 3908578
Now if you repeat the comparison
date_range[!(date_range %in% fdata2$DATE)]
## [1] "2010-05-01"
sum(is.na(fdata$TOTAL))
## [1] 0
Again, let us see an example with missing data
fdata2 <- fdata
fdata2$TOTAL[5] = NA
head(fdata2)
## # A tibble: 6 × 2
## DATE TOTAL
## <date> <dbl>
## 1 2010-01-01 4048493
## 2 2010-02-01 4130625
## 3 2010-03-01 4166613
## 4 2010-04-01 4142425
## 5 2010-05-01 NA
## 6 2010-06-01 3982368
and now
sum(is.na(fdata2$TOTAL))
## [1] 1
ts object for time seriesWe will now convert the table to a time series object. We need to provide the column (or columns) containing the values and we also need to describe the time indexing. To do this we provide the starting date and the number of observations per unit of time. The starting date is provided as a pair (year, month), or (year, quarter),
The single most important question is: what is the frequency of your data? see Hyndman FPP, The seasonal period in Section 2.1
# start -> year and month
# frequency = 12 -> monthly data
# frequency = 4 -> quarterly data
Now we create the ts object and display the first and last values
y <- ts(fdata$TOTAL, start = c(2010, 1), frequency = 12)
head(y, 30)
## Jan Feb Mar Apr May Jun Jul Aug Sep
## 2010 4048493 4130625 4166613 4142425 4066202 3982368 3908578 3969661 4017763
## 2011 4231003 4299263 4333669 4269360 4189659 4121801 4079742 4130927 4226744
## 2012 4599829 4712098 4750867 4744235 4714122 4615269
## Oct Nov Dec
## 2010 4085976 4110294 4100073
## 2011 4360926 4420462 4422359
## 2012
tail(y, 12)
## Jan Feb Mar Apr May Jun Jul Aug Sep
## 2022 3123078 3111684 3108763 3022503 2922991 2880582 2883812 2924240 2941919
## Oct Nov Dec
## 2022 2914892 2881380 2837653
The most basic and useful plot for time series is the time plot:
autoplot(y) +
ggtitle("Unemployment in Spain") +
xlab("Year") + ylab("Number unemployed")
or using the R base plotting system:
plot.ts(y,
main="Unemployment in Spain",
xlab="Year",
ylab="Number unemployed")
window# Select time series time frame
y2 <- y
y <- window(y, start = c(2010,1), end = c(2019,12))
# or directly from the original data:
y <- ts(fdata$TOTAL, start = c(2010,1), end = c(2019,12), frequency = 12)
Now we plot the time series
autoplot(y2, color = "blue") +
ggtitle("Unemployment in Spain") +
xlab("Year") + ylab("Number unemployed") +
autolayer(y, color = "red")
y_dec_add <- decompose(y, type="additive")
The decomposition object is a list that contains (among other things) the components. To get e.g. the seasonal component:
head(y_dec_add$seasonal, 20)
## Jan Feb Mar Apr May
## 2010 108216.0413 140371.9024 127003.0367 53488.8191 -31135.3893
## 2011 108216.0413 140371.9024 127003.0367 53488.8191 -31135.3893
## Jun Jul Aug Sep Oct
## 2010 -121222.6022 -157327.1485 -117894.5328 -69106.8198 21074.8654
## 2011 -121222.6022 -157327.1485 -117894.5328
## Nov Dec
## 2010 46194.2820 337.5459
## 2011
Let us visualize the decomposition
autoplot(y_dec_add) + xlab("Year") +
ggtitle("Classical additive decomposition")
Similarly
y_dec_mult <- decompose(y, type="multiplicative")
autoplot(y_dec_mult) + xlab("Year") +
ggtitle("Classical multiplicative decomposition")
library(seasonal)
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
##
## view
We obtain the decomposition as follows:
y_dec_seas <- seas(y)
In this case the components are stored in a mts (multi time series) object
head(y_dec_seas$data)
## final seasonal seasonaladj trend irregular adjustfac
## Jan 2010 3963891 1.0213433 3963891 3964972 0.9997271 1.0213433
## Feb 2010 3989900 1.0352703 3989900 3990377 0.9998804 1.0352703
## Mar 2010 4017108 1.0381748 4017108 4016480 1.0001565 1.0372171
## Apr 2010 4040293 1.0243325 4040293 4039090 1.0002978 1.0252783
## May 2010 4056567 1.0023751 4056567 4056696 0.9999683 1.0023751
## Jun 2010 4071217 0.9781762 4071217 4067370 1.0009459 0.9781762
But we can use seasonal(), trendcycle() and
remainder() functions to extract the individual components.
With seasadj() we can compute the seasonally adjusted time
series.
autoplot(y_dec_seas) + xlab("Year") +
ggtitle("SEATS decomposition")
autoplot(seasonal(y_dec_mult), series = "Multiplicative") +
forecast::autolayer(seasonal(y_dec_seas), series = "SEATS")
autoplot(seasadj(y_dec_add), series = "Additive") +
forecast::autolayer(seasadj(y_dec_mult), series = "Multiplicative") +
forecast::autolayer(seasadj(y_dec_seas),series = "SEATS")
autoplot(seasadj(y_dec_seas), series = "SEATS")
Can be obtained from the decompositions as follows:
ggsubseriesplot(seasonal(y_dec_add))
ggsubseriesplot(seasonal(y_dec_mult))
ggsubseriesplot(seasonal(y_dec_seas))
ggseasoonplotggseasonplot(y, year.labels=TRUE,continuous=TRUE)
library(TSstudio)
Let us load this dataset (run ?US_indicators after
loading the data)
data(US_indicators)
Examine the data
glimpse(US_indicators)
## Rows: 528
## Columns: 3
## $ Date <date> 1976-01-31, 1976-02-29, 1976-03-31, 1976-04-30, 1…
## $ `Vehicle Sales` <dbl> 885.2, 994.7, 1243.6, 1191.2, 1203.2, 1254.7, 1162…
## $ `Unemployment Rate` <dbl> 8.8, 8.7, 8.1, 7.4, 6.8, 8.0, 7.8, 7.6, 7.4, 7.2, …
str(US_indicators)
## 'data.frame': 528 obs. of 3 variables:
## $ Date : Date, format: "1976-01-31" "1976-02-29" ...
## $ Vehicle Sales : num 885 995 1244 1191 1203 ...
## $ Unemployment Rate: num 8.8 8.7 8.1 7.4 6.8 8 7.8 7.6 7.4 7.2 ...
head(US_indicators)
## Date Vehicle Sales Unemployment Rate
## 1 1976-01-31 885.2 8.8
## 2 1976-02-29 994.7 8.7
## 3 1976-03-31 1243.6 8.1
## 4 1976-04-30 1191.2 7.4
## 5 1976-05-31 1203.2 6.8
## 6 1976-06-30 1254.7 8.0
tail(US_indicators)
## Date Vehicle Sales Unemployment Rate
## 523 2019-07-31 1443.947 4.0
## 524 2019-08-31 1685.342 3.8
## 525 2019-09-30 1315.632 3.3
## 526 2019-10-31 1380.180 3.3
## 527 2019-11-30 1446.483 3.3
## 528 2019-12-31 1565.023 3.4
Rename variables and order dates
US_indicators <- US_indicators %>%
rename( VehicleSales = 'Vehicle Sales', UnemploymentRate= `Unemployment Rate`) %>%
arrange(Date)
head(US_indicators)
## Date VehicleSales UnemploymentRate
## 1 1976-01-31 885.2 8.8
## 2 1976-02-29 994.7 8.7
## 3 1976-03-31 1243.6 8.1
## 4 1976-04-30 1191.2 7.4
## 5 1976-05-31 1203.2 6.8
## 6 1976-06-30 1254.7 8.0
tail(US_indicators)
## Date VehicleSales UnemploymentRate
## 523 2019-07-31 1443.947 4.0
## 524 2019-08-31 1685.342 3.8
## 525 2019-09-30 1315.632 3.3
## 526 2019-10-31 1380.180 3.3
## 527 2019-11-30 1446.483 3.3
## 528 2019-12-31 1565.023 3.4
Check for complete dates and data
We will try to use the same strategy, but have you noticed any difference with the previous case? Look at the dates.
lubridate::day(US_indicators$Date) %>% head(20)
## [1] 31 29 31 30 31 30 31 31 30 31 30 31 31 28 31 30 31 30 31 31
This complicates the strategy of creating a complete date sequence
and comparing it with the dates in our data. Since this is monthly data
and month is all we care about, we replace all dates with the first day
of the month (run ?lubridate::day).
lubridate::day(US_indicators$Date) <- 1
head(US_indicators)
## Date VehicleSales UnemploymentRate
## 1 1976-01-01 885.2 8.8
## 2 1976-02-01 994.7 8.7
## 3 1976-03-01 1243.6 8.1
## 4 1976-04-01 1191.2 7.4
## 5 1976-05-01 1203.2 6.8
## 6 1976-06-01 1254.7 8.0
Now we proceed as before
date_range <- seq.Date(min(US_indicators$Date), max(US_indicators$Date), by = "months")
head(date_range)
## [1] "1976-01-01" "1976-02-01" "1976-03-01" "1976-04-01" "1976-05-01"
## [6] "1976-06-01"
tail(date_range)
## [1] "2019-07-01" "2019-08-01" "2019-09-01" "2019-10-01" "2019-11-01"
## [6] "2019-12-01"
date_range[!date_range %in% US_indicators$Date]
## Date of length 0
Check for missing data in the value columns
sum(is.na(US_indicators$VehicleSales))
## [1] 0
sum(is.na(US_indicators$UnemploymentRate))
## [1] 0
Alternatively
all(complete.cases(US_indicators))
## [1] TRUE
tvs <- US_indicators[, c("Date", "VehicleSales")]
str(tvs)
## 'data.frame': 528 obs. of 2 variables:
## $ Date : Date, format: "1976-01-01" "1976-02-01" ...
## $ VehicleSales: num 885 995 1244 1191 1203 ...
We create the ts object as we did before:
library(lubridate)
start_point <- c(year(min(tvs$Date)), month(min(tvs$Date)))
start_point
## [1] 1976 1
tvs_ts <- ts(data = tvs$'VehicleSales',
start = start_point,
frequency = 12)
and we do a basic time plot
plot.ts(tvs_ts,
main = "US Monthly Total Vehicle Sales",
ylab = "Thousands of Vehicle",
xlab = "Time"
)
We create it in a similar way:
US_indicators_ts <- ts(data = US_indicators[, c("VehicleSales",
"UnemploymentRate")],
start = c(year(min(tvs$Date)),
month(min(tvs$Date))),
frequency = 12)
str(US_indicators_ts)
## Time-Series [1:528, 1:2] from 1976 to 2020: 885 995 1244 1191 1203 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "VehicleSales" "UnemploymentRate"
Change plot type to single and see what happens. When
would you use one or the other?
plot.ts(US_indicators_ts,
plot.type = "multiple",
main = "US Monthly Vehicle Sales vs. Unemployment Rate",
xlab = "Time")
ts_plot(tvs_ts,
title = "US Monthly Total Vehicle Sales",
Ytitle = "Thousands of Vehicle",
slider = TRUE
)
ts_plot(US_indicators_ts,
title = "US Monthly Vehicle Sales vs. Unemployment Rate",
type = "multiple")
head(US_indicators)
## Date VehicleSales UnemploymentRate
## 1 1976-01-01 885.2 8.8
## 2 1976-02-01 994.7 8.7
## 3 1976-03-01 1243.6 8.1
## 4 1976-04-01 1191.2 7.4
## 5 1976-05-01 1203.2 6.8
## 6 1976-06-01 1254.7 8.0
tvs_data <- US_indicators[,-3]
head(tvs_data)
## Date VehicleSales
## 1 1976-01-01 885.2
## 2 1976-02-01 994.7
## 3 1976-03-01 1243.6
## 4 1976-04-01 1191.2
## 5 1976-05-01 1203.2
## 6 1976-06-01 1254.7
tvs_data$MA5 <- slider::slide_dbl(tvs$VehicleSales, mean,
.before = 2, .after = 2, .complete = TRUE)
head(tvs_data, 15)
## Date VehicleSales MA5
## 1 1976-01-01 885.2 NA
## 2 1976-02-01 994.7 NA
## 3 1976-03-01 1243.6 1103.58
## 4 1976-04-01 1191.2 1177.48
## 5 1976-05-01 1203.2 1211.00
## 6 1976-06-01 1254.7 1167.50
## 7 1976-07-01 1162.3 1140.84
## 8 1976-08-01 1026.1 1126.08
## 9 1976-09-01 1057.9 1092.02
## 10 1976-10-01 1129.4 1071.92
## 11 1976-11-01 1084.4 1060.68
## 12 1976-12-01 1061.8 1067.52
## 13 1977-01-01 969.9 1131.86
## 14 1977-02-01 1092.1 1185.86
## 15 1977-03-01 1451.1 1248.92
tail(tvs_data)
## Date VehicleSales MA5
## 523 2019-07-01 1443.947 1525.549
## 524 2019-08-01 1685.342 1475.970
## 525 2019-09-01 1315.632 1454.317
## 526 2019-10-01 1380.180 1478.532
## 527 2019-11-01 1446.483 NA
## 528 2019-12-01 1565.023 NA
tvs_MA_ts <- ts(data = tvs_data[,-1],
start = c(year(min(tvs$Date)),
month(min(tvs$Date))),
frequency = 12)
autoplot(tvs_MA_ts)
tvs_data$MA12 <- slider::slide_dbl(tvs$VehicleSales, mean,
.before = 5, .after = 6, .complete = TRUE)
tvs_data$MA2x12 <- slider::slide_dbl(tvs_data$MA12, mean,
.before = 1, .after = 0, .complete = TRUE)
head(tvs_data)
## Date VehicleSales MA5 MA12 MA2x12
## 1 1976-01-01 885.2 NA NA NA
## 2 1976-02-01 994.7 NA NA NA
## 3 1976-03-01 1243.6 1103.58 NA NA
## 4 1976-04-01 1191.2 1177.48 NA NA
## 5 1976-05-01 1203.2 1211.00 NA NA
## 6 1976-06-01 1254.7 1167.50 1107.875 NA
tvs_MA_ts <- ts(data = tvs_data[,-1],
start = c(year(min(tvs$Date)),
month(min(tvs$Date))),
frequency = 12)
head(tvs_MA_ts)
## VehicleSales MA5 MA12 MA2x12
## Jan 1976 885.2 NA NA NA
## Feb 1976 994.7 NA NA NA
## Mar 1976 1243.6 1103.58 NA NA
## Apr 1976 1191.2 1177.48 NA NA
## May 1976 1203.2 1211.00 NA NA
## Jun 1976 1254.7 1167.50 1107.875 NA
autoplot(tvs_MA_ts[ , 4], color = "yellow", size = 3, alpha = 0.5) +
autolayer(decompose(tvs_ts, type="additive")$trend)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).